Healthcare Index

Data Cleaning and Merge Datasets

We created a new dataset that contains the information for the Healthcare Index, Oil Production and Oil price. Index and Production variables have been transformed in order to have a closer scale to the Price variable.

{r}
# --- Healthcare series: load, regularize to a daily calendar, impute gaps ---

# Read the raw healthcare companies CSV
df_healthcare_companies <- read.csv('../data/viz_healthcare_companies.csv')

# Keep only the date and the 'TMO' price column
# NOTE(review): assumes 'TMO' is the adjusted series used as the index proxy — confirm upstream
df_healthcare_companies <- df_healthcare_companies %>% dplyr::select('date', 'TMO')

# Rename to the generic names used throughout this document
names(df_healthcare_companies) <- c('date', 'adjusted')

# Parse the date strings into Date objects
df_healthcare_companies$date <- as.Date(df_healthcare_companies$date)

# Bounds of the observed period (column is already Date, so as.Date() is a no-op here)
start_date <- as.Date(min(df_healthcare_companies$date))  
end_date <- as.Date(max(df_healthcare_companies$date))   

# Full calendar-day sequence spanning the observed period
date_range <- seq(start_date, end_date, by = "1 day")

# One-column data frame holding the complete calendar
date_dataset <- data.frame(Date = date_range)

# Full outer merge: introduces NA 'adjusted' rows for non-trading days
df_healthcare_companies <- merge(df_healthcare_companies, date_dataset, by.x = "date", by.y = "Date", all = TRUE)

# Check for missing values
# NOTE(review): the next commented line references df_oil_price — looks like a
# copy-paste from the oil-price notebook; kept for traceability.
# is.na(df_oil_price$adjusted)

# Rows containing at least one missing value (inspection only; unused below)
df_na_rows <- df_healthcare_companies[which(rowSums(is.na(df_healthcare_companies)) > 0),]

# Columns containing at least one missing value (inspection only; unused below)
df_na_cols <- df_healthcare_companies[, which(colSums(is.na(df_healthcare_companies)) > 0)]

# Impute NAs with an exponentially weighted moving average, window k = 4
# NOTE(review): imputeTS::na_ma is applied to the whole data frame — assumes it
# operates column-wise and leaves the 'date' column intact; TODO confirm.
imputed_time_series <- na_ma(df_healthcare_companies, k = 4, weighting = "exponential")

# Rebuild a plain data frame from the imputed result
df_healthcare_companies <- data.frame(imputed_time_series)

# Re-coerce the date column to Date
# NOTE(review): if the column is already Date, the format argument is ignored;
# it only matters if imputation turned dates into "%m/%d/%y" strings — verify.
df_healthcare_companies$date <- as.Date(df_healthcare_companies$date,format = "%m/%d/%y")

# Collapse the daily healthcare series to month-end observations and save it.
# For each (year, month) pair, locate the last observed day, rebuild that
# calendar date, and pull the matching daily adjusted value.

# Last observed day per year/month, reassembled into a month-end date
df_healthcare_companies_monthly <- df_healthcare_companies %>%
  mutate(Year = lubridate::year(date),
         Month = lubridate::month(date),
         Day = lubridate::day(date)) %>%
  group_by(Year, Month) %>%
  summarize(Max_Day = max(Day)) %>%
  mutate(date = make_date(Year, Month, Max_Day))

# Attach the daily adjusted values at those month-end dates
df_healthcare_companies_monthly <- merge(df_healthcare_companies_monthly,
                                         df_healthcare_companies,
                                         by = "date", all.x = TRUE)

# Reduce to the two columns of interest and apply their final names
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>%
  dplyr::select("date", "adjusted")
names(df_healthcare_companies_monthly) <- c('Date', 'Index')

# Persist the monthly series for the analysis section below
write.csv(df_healthcare_companies_monthly, '../data/df_healthcare_companies_monthly.csv', row.names = FALSE)

# ---------------------------------------------------------------------------------------------------------------------------------

# --- Build the merged monthly dataset (Index, Production, Price) ---

# Re-import the monthly healthcare series written above
df_healthcare_companies_monthly <- as.data.frame(read_csv('../data/df_healthcare_companies_monthly.csv'))

# Restrict to the 2000-2022 window
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>% filter(year(Date) >= 2000 & year(Date) <= 2022)

# Normalize each observation to the first of its month (common join key)
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>%
  mutate(date2 = make_date(year(Date), month(Date), 01))

# US oil production series
df_oil_production <- as.data.frame(read_csv('../data/viz_us_oil_production.csv'))

# Restrict to the 2000-2022 window
df_oil_production <- df_oil_production %>% filter(year(Date) >= 2000 & year(Date) <= 2022)

# Same first-of-month join key
df_oil_production <- df_oil_production %>%
  mutate(date2 = make_date(year(Date), month(Date), 01))

# Monthly oil price series
df_oil_price <- as.data.frame(read_csv('../data/df_oil_price_monthly.csv'))

# Same first-of-month join key (this file uses lowercase 'date')
df_oil_price <- df_oil_price %>%
  mutate(date2 = make_date(year(date), month(date), 01))

df_healthcare_companies_monthly$date2 <- as.Date(df_healthcare_companies_monthly$date2)

df_oil_production$date2 <- as.Date(df_oil_production$date2)

df_oil_price$date2 <- as.Date(df_oil_price$date2)

# Earliest date available in each of the three series
dates <- c(min(df_healthcare_companies_monthly$date2), min(df_oil_production$date2),min(df_oil_price$date2))

# Latest common start date (max of the minima); the name 'min_date' is kept
# because later chunks reference it, but note it is really the common start.
min_date <- max(dates)

# Align the healthcare series to the common start
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>% filter(date2 >= min_date)

# Align the production series to the common start
df_oil_production <- df_oil_production %>% filter(date2 >= min_date)

# Align the price series to the common start
df_oil_price <- df_oil_price %>% filter(date2 >= min_date)

# Keep the join key and the value column
df_healthcare_companies_monthly <- df_healthcare_companies_monthly %>% dplyr::select('date2', 'Index')

# Log-transform the Index to bring it onto a scale comparable with Price
df_healthcare_companies_monthly$Index <- log(df_healthcare_companies_monthly$Index)

# Keep the join key and the value column
df_oil_production <- df_oil_production %>% dplyr::select('date2', 'Production')

# Log-transform Production for the same reason
df_oil_production$Production <- log(df_oil_production$Production)

# Keep the join key and the value column (Price is left on its own scale)
df_oil_price <- df_oil_price %>% dplyr::select('date2', 'adjusted')



# The three aligned frames to be combined on date2
df_list <- list(df_healthcare_companies_monthly, df_oil_production, df_oil_price)

# NOTE(review): merge_dataframes() is a project-defined helper (not shown in
# this file) — presumably it joins the frames on date2; confirm its join type.
dd <- merge_dataframes(df_list)

# Final column names for the merged dataset
names(dd) <- c('DATE', 'Index', 'Production', 'Price') 

# Order chronologically
dd <- dd %>% dplyr::arrange(DATE)

# # Create the time series object
# dd.ts <- ts(dd,star=decimal_date(min_date),frequency = 12)

# Preview the merged dataset
knitr::kable(head(dd))
DATE Index Production Price
2000-08-01 2.942912 8.663715 1.0327613
2000-09-01 3.084701 8.658345 0.9740746
2000-10-01 3.169293 8.667164 1.0199688
2000-11-01 3.169293 8.671287 1.0546022
2000-12-01 3.178951 8.675051 0.8429909
2001-01-01 3.191460 8.665441 0.8954759

Plot

In the plot below we can observe the time series for the variables Index, Production and Price from 2000 to 2022.

{r}
# Overlay the three monthly series on a single chart.
# (Local name avoids shadowing base::plot.)
ts_plot <- ggplot(dd, aes(x = DATE)) +
  geom_line(aes(y = Index, color = "Index"), alpha = 0.7) +
  geom_line(aes(y = Production, color = "Production"), alpha = 0.7) +
  geom_line(aes(y = Price, color = "Price"), alpha = 0.7) +
  labs(title = "Healthcare Index ~ Oil Price + Oil Production",
       x = "Date",
       y = "Value",
       color = "Variables") +  # Set legend title
  scale_color_manual(values = c(Index = "red", Production = "blue", Price = "green")) +
  theme_minimal()

# Render as an interactive plotly widget
ggplotly(ts_plot)

Linear Models

In FIT 1 we can observe the first linear model where the Index is the response (dependent) variable and Oil Production and Oil Price are the predictor (independent) variables. The variable Oil Price has a p-value higher than the significance level of 0.05, so it is not significant.

In FIT 2 we can see the second linear model where the Index is the response (dependent) variable and Oil Production is the predictor (independent) variable. The model has good R-squared and RMSE values.

FIT 1
FIT 2
{r}
# FIT 1: the healthcare Index regressed on every other column of dd
# (DATE, Production and Price enter as predictors via the '.' shorthand).
fit_1 <- lm(Index ~ ., data = dd)
print(summary(fit_1))

Call:
lm(formula = Index ~ ., data = dd)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.49818 -0.13909 -0.06932  0.17576  0.51160 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -7.863e+00  8.388e-01  -9.375  < 2e-16 ***
DATE         3.373e-04  1.708e-05  19.752  < 2e-16 ***
Production   7.903e-01  1.180e-01   6.695 1.28e-10 ***
Price        2.301e-02  2.316e-02   0.994    0.321    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.2202 on 265 degrees of freedom
Multiple R-squared:  0.9574,    Adjusted R-squared:  0.9569 
F-statistic:  1985 on 3 and 265 DF,  p-value: < 2.2e-16
{r}
# FIT 2: simple regression of the healthcare Index on Oil Production only
# (Price was dropped after proving non-significant in FIT 1).
fit_2 <- lm(Index ~ Production, data = dd)
print(summary(fit_2))

Call:
lm(formula = Index ~ Production, data = dd)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.02182 -0.31398 -0.01122  0.35995  1.37957 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -22.06617    0.73678  -29.95   <2e-16 ***
Production    2.97392    0.08291   35.87   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4405 on 267 degrees of freedom
Multiple R-squared:  0.8281,    Adjusted R-squared:  0.8275 
F-statistic:  1287 on 1 and 267 DF,  p-value: < 2.2e-16

R^2: 0.8281319 
RMSE: 0.43888

Stationarity

The ACF plot shows high autocorrelation in the lags, so the time series is not stationary. Evaluating the results of the Augmented Dickey-Fuller test, the p-value is higher than the significance level of 0.05. Therefore, we cannot reject the null hypothesis that the time series is non-stationary (i.e., that it contains a unit root).

PLOTS
AUGMENTED DICKEY-FULLER TEST

Time Series, ACF Plot and PACF Plot

{r}
# Residuals of FIT 2; display the series alongside its ACF and PACF plots
lm.residuals <- residuals(fit_2)
ggtsdisplay(lm.residuals)

Augmented Dickey-Fuller Test

{r}
# Augmented Dickey-Fuller test on the regression residuals
# (null hypothesis: unit root / non-stationary; alternative: stationary)
adf_test <- adf.test(lm.residuals)
print(adf_test)

    Augmented Dickey-Fuller Test

data:  lm.residuals
Dickey-Fuller = -2.0076, Lag order = 6, p-value = 0.5728
alternative hypothesis: stationary

Differencing

We apply the first difference to the residuals of the linear model to obtain a stationary time series. We can observe the volatility of the residuals and two main volatility clusters in the data points near 100 and near 250.

In the acf plot, we can observe that the differenced residuals look stationary.

{r}
# First difference of the residuals, with ACF/PACF, to assess stationarity
ggtsdisplay(diff(lm.residuals))

Model Selection

Based on the ACF and PACF plot for the first difference on residuals, we define:

\(p = 2\)

\(d = 1\)

\(q = 2\)

Next we will run the model diagnostics on the models that have the minimum AIC, minimum BIC and the auto.arima() results.

MODEL PARAMETERS
MODEL SUMMARY
AUTO.ARIMA()

Model Parameters

{r}
# Grid search over ARIMA(p, d, q) orders for the residual series, recording
# AIC / BIC / AICc for every candidate.
xt <- lm.residuals

# Maximum orders to consider (chosen from the ACF/PACF of the differenced residuals)
p_value <- 2
d_value <- 1
q_value <- 2

# One result row per (p, d, q) combination
rows <- (p_value + 1) * (d_value + 1) * (q_value + 1)

# Pre-allocated results matrix: p, d, q, AIC, BIC, AICc
ls <- matrix(NA_real_, nrow = rows, ncol = 6)

# The original loops iterated `0:p_value+1` (which, by precedence, is 1:3) and
# then subtracted 1 everywhere; here the orders are looped directly, keeping
# the same iteration order (p outer, q middle, d inner) and the same models.
i <- 1
for (p in 0:p_value) {
  for (q in 0:q_value) {
    for (d in 0:d_value) {
      if (p + d + q <= 8) {  # usual complexity threshold
        model <- Arima(xt, order = c(p, d, q), include.drift = TRUE)
        ls[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)
        i <- i + 1
      }
    }
  }
}

# Drop any unfilled rows (none when the threshold never triggers) and label
temp <- na.omit(as.data.frame(ls))

names(temp) <- c("p","d","q","AIC","BIC","AICc")

# Display the full candidate table
knitr::kable(temp)
p d q AIC BIC AICc
0 0 0 279.53081 290.31494 279.62137
0 1 0 -335.92247 -328.74050 -335.87719
0 0 1 21.29335 35.67219 21.44486
0 1 1 -335.55719 -324.78423 -335.46628
0 0 2 -114.66152 -96.68796 -114.43338
0 1 2 -338.38902 -324.02507 -338.23693
1 0 0 -338.26285 -323.88401 -338.11134
1 1 0 -335.12381 -324.35085 -335.03290
1 0 1 -336.93043 -318.95688 -336.70230
1 1 1 -338.39144 -324.02749 -338.23935
1 0 2 -338.27166 -316.70339 -337.95105
1 1 2 -337.46060 -319.50566 -337.23159
2 0 0 -336.77331 -318.79975 -336.54517
2 1 0 -337.55096 -323.18701 -337.39886
2 0 1 -337.53053 -315.96226 -337.20992
2 1 1 -337.47950 -319.52457 -337.25049
2 0 2 -336.77357 -311.61059 -336.34445
2 1 2 -335.48229 -313.93637 -335.16045

Model Summary

{r}
# Candidate order with the lowest AIC
temp[which.min(temp$AIC),]
   p d q       AIC       BIC      AICc
10 1 1 1 -338.3914 -324.0275 -338.2393

{r}
# Candidate order with the lowest BIC
temp[which.min(temp$BIC),]
  p d q       AIC       BIC      AICc
2 0 1 0 -335.9225 -328.7405 -335.8772

{r}
# Candidate order with the lowest AICc
temp[which.min(temp$AICc),]
   p d q       AIC       BIC      AICc
10 1 1 1 -338.3914 -324.0275 -338.2393

auto.arima()

{r}
# Let auto.arima() select (p, d, q) for the residual series automatically.
# NOTE(review): the original comment mentioned an exogenous variable, but no
# xreg argument is supplied here — the search runs on xt alone.

fit_auto_arima <- auto.arima(xt)

summary(fit_auto_arima)
Series: xt 
ARIMA(0,1,2) 

Coefficients:
          ma1      ma2
      -0.0965  -0.1310
s.e.   0.0607   0.0599

sigma^2 = 0.01622:  log likelihood = 172.96
AIC=-339.91   AICc=-339.82   BIC=-329.14

Training set error measures:
                     ME    RMSE        MAE      MPE     MAPE     MASE
Training set 0.00548734 0.12666 0.08145436 77.30622 163.8885 0.991597
                    ACF1
Training set 0.007261496

Model Diagnostics

Comparing the results of the model diagnostics, we can observe that the ARIMA(0, 1, 2) model is the best. The lag values in the ACF plot are within the confidence bands, so the residuals show no significant autocorrelation. The standard residuals plotted in the Normal Q-Q plot show the similarity to normality. The p-values for the Ljung-Box statistic confirm that the residuals behave like white noise (no remaining autocorrelation).

{r}
# Diagnostics for the minimum-AIC candidate.
# The local is named best_aic rather than AIC: binding a data frame to 'AIC'
# shadows stats::AIC() in the global environment (an anti-pattern).
best_aic <- temp[which.min(temp$AIC),]

p1 <- best_aic$p
d1 <- best_aic$d
q1 <- best_aic$q

# sarima() prints its diagnostics as a side effect; capture the text output
model_output <- capture.output(sarima(xt, p1, d1, q1))

{r}
# Diagnostics for the minimum-BIC candidate.
# The local is named best_bic rather than BIC: binding a data frame to 'BIC'
# shadows stats::BIC() in the global environment (an anti-pattern).
best_bic <- temp[which.min(temp$BIC),]

p2 <- best_bic$p
d2 <- best_bic$d
q2 <- best_bic$q

# sarima() prints its diagnostics as a side effect; capture the text output
model_output <- capture.output(sarima(xt, p2, d2, q2))

{r}
# Orders chosen by auto.arima(): ARIMA(0, 1, 2)
p3 <- 0
d3 <- 1
q3 <- 2

# sarima() prints its diagnostics as a side effect; capture the text output
model_output <- capture.output(sarima(xt, p3, d3, q3))

Residuals

We plot the residuals and the residuals squared of the arima model using the ACF and PACF plot to determine the parameters for the GARCH model.

The values that will be used to assess the model are the following:

\(p = 1\)

\(q = 2\)

Residuals ACF

{r}
# Refit the chosen ARIMA(0, 1, 2) with drift and extract its residuals
arima_model <- Arima(xt,order=c(p3, d3, q3),include.drift = TRUE)

arima.res <- residuals(arima_model)

# ACF of the ARIMA residuals (should resemble white noise)
acf(arima.res)

Residuals^2 ACF

{r}
# ACF of the squared residuals — persistence here indicates ARCH effects
acf(arima.res^2)

Residuals^2 PACF

{r}
# PACF of the squared residuals — used to choose the ARCH order
pacf(arima.res^2)

GARCH Model

After running the GARCH models possibilities we observe that the model with the lowest AIC is GARCH(2, 1). The summary of the model shows that the beta1 component is not significant, therefore we decide to remove that component and test the GARCH(2, 0) model. For the second model, all the components are significant and the resulting AIC is lower compared to the AIC value of the GARCH(2, 1) model.

{r}
# Fit the 2 x 2 grid of small GARCH specifications on the ARIMA residuals and
# keep the candidate with the lowest AIC.
# NOTE(review): tseries::garch() takes order = c(GARCH order, ARCH order); the
# call below passes c(q, p) with the loop's own p/q naming — kept exactly as
# the original so the fitted candidates are unchanged, but worth confirming
# against the narrative's "GARCH(2, 1)" labeling.
model <- vector("list", 4)  # pre-allocate one slot per candidate
cc <- 1
for (p in 1:2) {
  for (q in 1:2) {
    model[[cc]] <- garch(arima.res, order = c(q, p), trace = FALSE)
    cc <- cc + 1
  }
}

GARCH_AIC <- sapply(model, AIC)

# which.min() is robust to floating-point ties; the original
# which(GARCH_AIC == min(GARCH_AIC)) could return several indices, which
# would make the [[ ]] extraction perform recursive indexing and fail.
model[[which.min(GARCH_AIC)]]

Call:
garch(x = arima.res, order = c(q, p), trace = F)

Coefficient(s):
      a0        a1        a2        b1  
0.006728  0.388092  0.280750  0.020268  
{r}
# GARCH(2, 1) fit on the ARIMA residuals (trace spelled as FALSE, not the reassignable F)
summary(garchFit(~garch(2,1), data = arima.res, trace = FALSE))

Title:
 GARCH Modelling 

Call:
 garchFit(formula = ~garch(2, 1), data = arima.res, trace = F) 

Mean and Variance Equation:
 data ~ garch(2, 1)
<environment: 0x130e6cd18>
 [data = arima.res]

Conditional Distribution:
 norm 

Coefficient(s):
        mu       omega      alpha1      alpha2       beta1  
-0.0017586   0.0067031   0.3843606   0.2806766   0.0200725  

Std. Errors:
 based on Hessian 

Error Analysis:
        Estimate  Std. Error  t value Pr(>|t|)    
mu     -0.001759    0.006263   -0.281  0.77887    
omega   0.006703    0.001252    5.352 8.68e-08 ***
alpha1  0.384361    0.120258    3.196  0.00139 ** 
alpha2  0.280677    0.102399    2.741  0.00612 ** 
beta1   0.020073    0.067667    0.297  0.76674    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Log Likelihood:
 210.7997    normalized:  0.783642 

Description:
 Fri Dec  8 02:25:03 2023 by user:  


Standardised Residuals Tests:
                                  Statistic      p-Value
 Jarque-Bera Test   R    Chi^2  574.5326941 0.000000e+00
 Shapiro-Wilk Test  R    W        0.9136106 2.422078e-11
 Ljung-Box Test     R    Q(10)    6.9532841 7.298477e-01
 Ljung-Box Test     R    Q(15)   10.7342224 7.711973e-01
 Ljung-Box Test     R    Q(20)   12.1731000 9.099891e-01
 Ljung-Box Test     R^2  Q(10)    2.2231241 9.943263e-01
 Ljung-Box Test     R^2  Q(15)    3.0672555 9.995386e-01
 Ljung-Box Test     R^2  Q(20)    3.7192706 9.999745e-01
 LM Arch Test       R    TR^2     2.2067775 9.990169e-01

Information Criterion Statistics:
      AIC       BIC       SIC      HQIC 
-1.530109 -1.463293 -1.530784 -1.503276 
{r}
# GARCH(2, 0) fit — beta1 dropped after proving non-significant in GARCH(2, 1)
summary(garchFit(~garch(2,0), data = arima.res, trace = FALSE))

Title:
 GARCH Modelling 

Call:
 garchFit(formula = ~garch(2, 0), data = arima.res, trace = F) 

Mean and Variance Equation:
 data ~ garch(2, 0)
<environment: 0x1374deac8>
 [data = arima.res]

Conditional Distribution:
 norm 

Coefficient(s):
        mu       omega      alpha1      alpha2  
-0.0017586   0.0069147   0.3836984   0.2883843  

Std. Errors:
 based on Hessian 

Error Analysis:
        Estimate  Std. Error  t value Pr(>|t|)    
mu     -0.001759    0.006278   -0.280  0.77938    
omega   0.006915    0.001099    6.292 3.14e-10 ***
alpha1  0.383698    0.120256    3.191  0.00142 ** 
alpha2  0.288384    0.097400    2.961  0.00307 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Log Likelihood:
 210.7456    normalized:  0.783441 

Description:
 Fri Dec  8 02:25:03 2023 by user:  


Standardised Residuals Tests:
                                  Statistic      p-Value
 Jarque-Bera Test   R    Chi^2  565.2641722 0.000000e+00
 Shapiro-Wilk Test  R    W        0.9143988 2.796152e-11
 Ljung-Box Test     R    Q(10)    7.0370496 7.219427e-01
 Ljung-Box Test     R    Q(15)   10.7816832 7.679114e-01
 Ljung-Box Test     R    Q(20)   12.2495164 9.072201e-01
 Ljung-Box Test     R^2  Q(10)    2.1211064 9.953248e-01
 Ljung-Box Test     R^2  Q(15)    2.9819760 9.996126e-01
 Ljung-Box Test     R^2  Q(20)    3.6504702 9.999782e-01
 LM Arch Test       R    TR^2     2.1341298 9.991709e-01

Information Criterion Statistics:
      AIC       BIC       SIC      HQIC 
-1.537142 -1.483689 -1.537576 -1.515675 

Model Equation

\(Y_{t} = \mu + \varepsilon_{t} = -0.001759 + \sigma_{t} z_{t}\)

\(\sigma_{t}^{2} = \omega + \alpha_{1}\,\varepsilon_{t-1}^{2} + \alpha_{2}\,\varepsilon_{t-2}^{2} = 0.006915 + 0.383698\,\varepsilon_{t-1}^{2} + 0.288384\,\varepsilon_{t-2}^{2}\)

Forecast Plot

Below we can observe the Forecast Plot with the predictions from the ARIMA(0, 1, 2) + GARCH(2, 0) model.

{r}
# Final variance model: GARCH(2, 0) on the ARIMA(0, 1, 2) residuals
final.fit <- garchFit(~garch(2,0), data = arima.res, trace = FALSE)

# 100-step-ahead forecast with the built-in forecast plot.
# (Result variable renamed from 'plot' to avoid shadowing base::plot.)
garch_forecast <- predict(final.fit, n.ahead = 100, plot = TRUE)